Date: Mon Mar 30 13:35:28 2020
Scientist: Ran Yin
Sequencing (Waksman): Dibyendu Kumar
Statistics: Davit Sargsyan
Principal Investigator: Ah-Ng Kong

# Taxonomic Ranks:
# **K**ing **P**hillip **C**an n**O**t **F**ind **G**reen **S**ocks
# * Kingdom                
# * Phylum                    
# * Class                   
# * Order                   
# * Family     
# * Genus     
# * Species  
options(stringsAsFactors = FALSE,
        scipen = 999)
# # Increase mmemory size to 64 Gb----
# invisible(utils::memory.limit(65536))
# str(knitr::opts_chunk$get())
# # NOTE: the below does not work!
# knitr::opts_chunk$set(echo = FALSE, 
#                       message = FALSE,
#                       warning = FALSE,
#                       error = FALSE)
# require(knitr)
# require(kableExtra)
# require(shiny)
require(phyloseq)
require(data.table)
require(ggplot2)
require(plotly)
require(DT)
source("source/functions_may2019.R")
# On Windows set multithread=FALSE----
mt <- TRUE

1 Introduction

November 2018 Batch: Nrf2 KO (-/-) Mice
May 2019 Batch: Wild Type Mice

2 Data preprocessing

2.1 Raw Data

FastQ files were downloaded from Dr. Kumar’s DropBox. A total of 60 files (2 per sample, pair-ended) and 2 metadata files were downloaded.

2.2 Script

Data processing scripts (nrf2ubiome_dada2_nov2018_v1.Rmd and nrf2ubiome_dada2_may2019_v1.Rmd) were developed using DADA2 Pipeline Tutorial (1.12) with tips and tricks from the University of Maryland Shool of Medicine Institute for Genome Sciences (IGS) Microbiome Analysis Workshop (April 8-11, 2019). The oresults of the DADA2 scripts (data_nov2018/ps_nov2018.RData and data_may2019/ps_may2019.RData) are explored in this document.

3 Meta data: sample description

# Load data----
# Counts
load("data_nov2018/ps_nov2018.RData")
ps_nov2018 <- copy(ps)
rm(ps)
# Remove "Undetermined" sample
ps_nov2018 <- subset_samples(ps_nov2018, 
                             Name != "Undetermined_S0")
load("data_may2019/ps_may2019.RData")
gc(verbose = FALSE)
          used  (Mb) gc trigger  (Mb) max used  (Mb)
Ncells 4019072 214.7    7165308 382.7  6470341 345.6
Vcells 7844836  59.9   14786712 112.9 12249660  93.5
# Taxonomy (use the same one for both batches!)
load("data_may2019/taxa.RData")
taxa <- data.table(seq16s = rownames(taxa),
                   taxa)
# Samples
meta18 <- ps_nov2018@sam_data
datatable(meta18,
          options = list(pageLength = nrow(meta18)),
          caption = "Nrf2 KO (Nov 18) Meta Data")

meta19 <- ps_may2019@sam_data
datatable(meta19,
          options = list(pageLength = nrow(meta19)),
          caption = "WT (May 19) Meta Data")

4 OTUs mapped to Kingdoms, each batch separate

5 Merge the 2 data sets

pss <- merge_phyloseq(ps_nov2018,
                      ps_may2019)
tmp <- pss@sam_data
# Diet
meta <- data.table(Sample = rownames(pss@sam_data),
                   Diet = tmp$Diet)
meta$Diet <- gsub(x = meta$Diet, 
                  pattern = " Control",
                  replacement = "")
meta$Diet[!is.na(tmp$TREATMENT)] <- tmp$TREATMENT[!is.na(tmp$TREATMENT)]
meta$Diet <- gsub(x = meta$Diet,
                  pattern = "Control",
                  replacement = "AIN93M")
meta$Diet[is.na(meta$Diet)] <- "Pooled"
meta$Diet <- factor(meta$Diet,
                    levels = c("AIN93M",
                               "PEITC",
                               "Pooled"))
# Gemotype
meta$Genotype <- "Nrf2 KO"
meta$Genotype[is.na(tmp$Sex)] <- "Wild Type"
meta$Genotype <- factor(meta$Genotype,
                        levels = c("Wild Type",
                                   "Nrf2 KO"))
# Time
meta$Week <- as.numeric(as.character(tmp$Week)) - 4
meta$Week[meta$Genotype == "Wild Type"] <- as.numeric(gsub(x = tmp$WEEK[meta$Genotype == "Wild Type"],
                                                           pattern = "week ",
                                                           replacement = ""))
meta$Week <- paste("Week",
                   meta$Week)
meta$Week <- factor(meta$Week,
                    levels = c("Week 0",
                               "Week 1",
                               "Week 4",
                               "Week 5"))
# Mouse ID
meta$Mouse_Num <- as.numeric(as.character(tmp$MouseNum))
meta$Mouse_Num[meta$Genotype == "Wild Type"] <- as.numeric(substr(x = tmp$SAMPLE_NAME[meta$Genotype == "Wild Type"],
                                                                  start = 3, 
                                                                  stop = 3))
# Cage number
meta$Cage <- as.character(tmp$Cage)
meta$Cage[meta$Genotype == "Wild Type"] <- substr(x = tmp$SAMPLE_NAME[meta$Genotype == "Wild Type"],
                                                  start = 2, 
                                                  stop = 2)
meta$Cage <- factor(meta$Cage)
meta <- data.frame(meta)
rownames(meta) <- meta$Sample
meta
# Edit meta data
sample_data(pss) <- meta
pss@sam_data
Sample Data:        [69 samples by 6 sample variables]:
dim(pss@otu_table@.Data)
[1]    69 17046
# Remove OTU unmapped to Bacteria
ps0 <- subset_taxa(pss, 
                   Kingdom == "Bacteria")
dim(ps0@otu_table@.Data)
[1]    69 16351

6 OTU table (first 10 rows)

7 Total counts per sample (i.e. sequencing depth)

p1 <- ggplot(smpl,
             aes(x = Sample,
                 y = Total,
                 group =Diet,
                 fill = Diet)) +
  facet_wrap(~ Genotype + Week,
             scale = "free") +
  geom_bar(stat = "identity",
           color = "black") +
  scale_x_discrete("") +
  scale_y_continuous("Number of Reads") +
  scale_fill_grey("Treatment", 
                  start = 0.1, 
                  end = 1,
                  na.value = "red",
                  aesthetics = "fill") +
  theme_bw() + 
  theme(panel.border = element_blank(), 
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(), 
        axis.title.x = element_blank(),
        # axis.text.x = element_blank(),
        axis.text.x = element_text(angle = 45,
                                   hjust = 1),
        # axis.ticks.x=element_blank(),
        legend.position = "top")
tiff(filename = "tmp/seq_depth_nov2018_may2019.tiff",
     height =6,
     width = 8,
     units = "in",
     res = 600,
     compression = "lzw+p")
print(p1)
graphics.off()
print(p1)

t2 <- data.table(table(tax_table(ps0)[, "Phylum"],
                                  exclude = NULL))
t2$V1[is.na(t2$V1)] <- "Unknown"
setorder(t2, -N)
t2[, pct := N/sum(N)]
setorder(t2, -N)
colnames(t2) <- c("Phylum",
                  "Number of OTUs",
                  "Percent of OTUs")
datatable(t2,
          rownames = FALSE,
          caption = "Number of Bacterial OTUs by Phylum",
          class = "cell-border stripe",
          options = list(search = FALSE,
                         pageLength = nrow(t2))) %>%
  formatCurrency(columns = 2,
                 currency = "",
                 mark = ",",
                 digits = 0) %>%
  formatPercentage(columns = 3,
                   digits = 2)

ps1 <- subset_taxa(ps0, 
                   !is.na(Phylum))
dim(ps1@otu_table@.Data)
[1]    69 15919

8 Remove Phylum

Remove:
1. Unmapped OTUs (“Unknown”).
2. Cyanobacteria: aerobic, photosynthesizing bacteria that probably got into the sample through food.
NOTE: Chloroflexi might be ok.

ps0 <- subset_taxa(ps0,
                   !(Phylum %in% c("Unknown",
                                   "Cyanobacteria")))

9 Richness (Alpha diversity)

Shannon index (aka Shannon enthrophy) is calculated as:
H’ = -sum(1 to R)p(i)ln(p(i)) When there is exactly 1 type of data (e.g. a single species in the sample), H’=0. The opposite scenario is when there are R>1 species present in the sample in the exact same amounts and H’=ln(R).

Shannon’s diversity index was calculated for each sample and ploted over time.

shannon.ndx <- estimate_richness(ps0,
                                 measures = "Shannon")
shannon.ndx <- data.table(Sample = rownames(shannon.ndx),
                          shannon.ndx)
smpl <- merge(smpl,
              shannon.ndx,
              by = "Sample")
p1 <- ggplot(smpl,
             aes(x = Total,
                 y = Shannon,
                 fill = Genotype,
                 shape = Week)) +
  geom_point(size = 2) +
  scale_shape_manual(breaks = unique(smpl$Week),
                     values = 21:24)
tiff(filename = "tmp/shannon_vs_depth_nov18_may19.tiff",
     height = 5,
     width = 6,
     units = "in",
     res = 600,
     compression = "lzw+p")
print(p1)
graphics.off()
ggplotly(p1)

Even though estimate_richness function does not adjust for the sequencing depth, there is no correlation between the index and the sample’s sequecing depth. Proceed with the comparison.

10 Shannon idex over time

p1 <- plot_richness(ps0,
                    x = "Week", 
                    measures = "Shannon") +
  facet_wrap(~ Genotype,
             scale = "free_x") +
  geom_line(aes(group = paste0(Diet,
                               Cage,
                               Mouse_Num)),
            color = "black") +
  geom_point(aes(fill = Diet),
             shape = 21,
             size = 3,
             color = "black") +
  scale_x_discrete("") +
  theme(axis.text.x = element_text(angle = 30,
                                   hjust = 1,
                                   vjust = 1))
ggplotly(p = p1,
         tooltip = c("Mouse_Num",
                     "value"))

p1 <- p1 + 
  scale_fill_discrete("") +
  theme(legend.position = "top")
tiff(filename = "tmp/shannon_nov18_may19.tiff",
     height = 5,
     width = 8,
     units = "in",
     res = 600,
     compression = "lzw+p")
print(p1)
graphics.off()

The plot above suggests that the largest differences in alpha diversity (as measured by Shannon’s index) are in genotype. THis was also confirmed in the 3rd study (September 2019 batch)

11 Average Shannon Index

# Average shannon index by treatment group
tmp <- data.table(copy(smpl))
tmp[, mu := mean(Shannon),
    by = list(Diet,
              Genotype,
              Week)]
tmp[, sem := sd(Shannon)/sqrt(.N),
    by = list(Diet,
              Genotype,
              Week)]
tmp <- unique(tmp[, c("Diet",
                      "Genotype",
                      "Week",
                      "mu",
                      "sem")])
p1 <- ggplot(tmp,
             aes(x = Week,
                 y = mu,
                 ymin = mu - sem,
                 ymax = mu + sem,
                 fill = Diet,
                 group = Diet)) +
  facet_wrap(~ Genotype,
             scale = "free_x") +
  geom_errorbar(position = position_dodge(0.3),
                width = 0.4) +
  geom_line(position = position_dodge(0.3)) +
  geom_point(size = 3,
             shape = 21,
             position = position_dodge(0.3)) +
  scale_x_discrete("") +
  scale_y_continuous("Shannon Index") +
  scale_fill_grey("Treatment", 
                  start = 0, 
                  end = 1,
                  na.value = "red",
                  aesthetics = "fill") +
  theme_bw() + 
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(), 
        # panel.border = element_blank(), 
        axis.title.x = element_blank(),
        # axis.text.x = element_blank(),
        axis.text.x = element_text(angle = 45,
                                   hjust = 1),
        axis.ticks.x=element_blank(),
        legend.position = "top")
tiff(filename = "tmp/avg_shannon_nov18_may19.tiff",
     height = 4,
     width = 6,
     units = "in",
     res = 600,
     compression = "lzw+p")
print(p1)
graphics.off()
print(p1)

NOTE: cannot test diet effect because of unassigend pooled samples in the KO mice. For the same reason, cannot use mixed effects model on the whole data set. At best, testing genotype and time trend by assuming Week 4 in WT = Week 5 in KO.

smpl$Timepoint <- as.numeric(smpl$Week)
smpl$Timepoint[smpl$Timepoint == 4] <- 3
m1 <- lm(Shannon ~ Timepoint*Genotype,
         data = smpl)
summary(m1)

Call:
lm(formula = Shannon ~ Timepoint * Genotype, data = smpl)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.45931 -0.09344  0.02849  0.08560  0.34790 

Coefficients:
                          Estimate Std. Error t value            Pr(>|t|)    
(Intercept)                6.55295    0.08207  79.842 <0.0000000000000002 ***
Timepoint                  0.04053    0.03799   1.067              0.2900    
GenotypeNrf2 KO            0.35620    0.13511   2.636              0.0105 *  
Timepoint:GenotypeNrf2 KO  0.09221    0.05778   1.596              0.1154    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.1699 on 65 degrees of freedom
Multiple R-squared:  0.7687,    Adjusted R-squared:  0.7581 
F-statistic: 72.02 on 3 and 65 DF,  p-value: < 0.00000000000000022

No significant time trend; significantly higher alpha diversity in the Nrf2 KO mice. This is possibly a batch effect but we observed same trend in Study 3 (Crabbery and PEITC with both genotypes in the same study).

12 Bacteriotides vs. Firmicutes

counts_p <- counts_by_tax_rank(dt1 = otu,
                               aggr_by = "Phylum")
fb <- t(counts_p[Phylum %in% c("Bacteroidetes",
                               "Firmicutes"), -1])
fb <- data.table(Sample = rownames(fb),
                 Bacteroidetes = fb[, 1],
                 Firmicutes = fb[, 2])
fb <- data.table(merge(meta,
            fb,
            by = "Sample"))
p1 <- ggplot(fb,
             aes(x = log2(Bacteroidetes),
                 y = log2(Firmicutes),
                 fill = Genotype)) +
  geom_point(size = 2,
             color = "black",
             shape = 21)
p2 <- ggplot(fb,
             aes(x = log2(Bacteroidetes),
                 y = log2(Firmicutes),
                 fill = Week)) +
  geom_point(size = 2,
             color = "black",
             shape = 21)
p3 <- ggplot(fb,
             aes(x = log2(Bacteroidetes),
                 y = log2(Firmicutes),
                 fill = Diet)) +
  geom_point(size = 2,
             color = "black",
             shape = 21)
p4 <- ggplot(fb,
             aes(x = Week,
                 y = Bacteroidetes/Firmicutes,
                 fill = Diet,
                 group = paste0(Diet,
                               Cage,
                               Mouse_Num))) +
  facet_grid(~ Genotype,
             scale = "free_x") +
  geom_hline(yintercept = 1,
             linetype = "dashed") +
  geom_line(position = position_dodge(0.3)) +
  geom_point(size = 2,
             color = "black",
             shape = 21,
             position = position_dodge(0.3))  +
  theme_bw() + 
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(), 
        # panel.border = element_blank(), 
        axis.title.x = element_blank(),
        # axis.text.x = element_blank(),
        axis.text.x = element_text(angle = 45,
                                   hjust = 1),
        axis.ticks.x=element_blank(),
        legend.position = "top")
tiff(filename = "tmp/bact_vs_firm_nov18_may19.tiff",
     height = 6,
     width =8,
     units = "in",
     res = 600,
     compression = "lzw+p")
gridExtra::grid.arrange(p1, p2, p3, p4)
graphics.off()
gridExtra::grid.arrange(p1, p2, p3, p4)

fb[, mu := mean(Bacteroidetes/Firmicutes),
   by = c("Diet",
          "Genotype",
          "Week")]
fb[, sem := sd(Bacteroidetes/Firmicutes)/sqrt(.N),
   by = c("Diet",
          "Genotype",
          "Week")]
mufb <- unique(fb[, c("Diet",
                      "Genotype",
                      "Week",
                      "mu",
                      "sem")])
p1 <- ggplot(mufb,
             aes(x = Week,
                 y = mu,
                 ymin = mu - sem,
                 ymax = mu + sem,
                 fill = Diet,
                 group = Diet)) +
  facet_wrap(~ Genotype,
             scale = "free_x") +
  geom_hline(yintercept = 1,
             linetype = "dashed") +
  geom_errorbar(position = position_dodge(0.3),
                width = 0.4) +
  geom_line(position = position_dodge(0.3)) +
  geom_point(size = 3,
             shape = 21,
             position = position_dodge(0.3)) +
  scale_x_discrete("") +
  scale_y_continuous("Bacteroidetes/Firmicutes") +
  scale_fill_grey("Treatment", 
                  start = 0, 
                  end = 1,
                  na.value = "red",
                  aesthetics = "fill") +
  theme_bw() + 
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(), 
        # panel.border = element_blank(), 
        axis.title.x = element_blank(),
        # axis.text.x = element_blank(),
        axis.text.x = element_text(angle = 45,
                                   hjust = 1),
        axis.ticks.x=element_blank(),
        legend.position = "top")
tiff(filename = "tmp/avg_bact_firm_nov18_may19.tiff",
     height = 4,
     width = 6,
     units = "in",
     res = 600,
     compression = "lzw+p")
print(p1)
graphics.off()
print(p1)

mufb[, est := paste0(round(mu, 2),
                     "(",
                     round(sem, 2),
                     ")")]
t1 <- dcast.data.table(mufb,
                       Genotype + Diet ~ Week,
                       value.var = "est")
datatable(t1,
          rownames = FALSE,
          caption = "Average Ratio of Bacteroidetes and Firmicutes",
          class = "cell-border stripe",
          options = list(search = FALSE,
                         pageLength = nrow(t1)))

13 Update OTU table: excuded unknown phylum and Cyanobacteria

otu <- data.table(ps0@tax_table@.Data,
                  t(ps0@otu_table@.Data))
# Remove Species mapping'
otu$Species <- NULL
dim(otu)
[1] 16221    75

14 1. Phylum

14.1 Counts at Phylum level

14.2 Relative abundance (%) at Phylum level

14.3 PCA at Phylum level

dt_pca <- t(ra_p[, 2:ncol(ra_p)])
There were 28 warnings (use warnings() to see them)
colnames(dt_pca) <- ra_p$Phylum
dt_pca_p <- data.table(Sample = rownames(dt_pca),
                       dt_pca)
dt_pca_p <- merge(smpl,
                  dt_pca_p,
                  by = "Sample")
# Keep only the phylum with non-zero counts
tmp <- dt_pca_p[, 10:ncol(dt_pca_p)]
keep_p <- colnames(tmp)[colSums(tmp) > 0]
dt_pca <- dt_pca[, keep_p]
# m1 <- prcomp(dt_pca,
#              center = TRUE,
#              scale. = TRUE)
# m1 <- prcomp(dt_pca,
#              center = FALSE,
#              scale. = FALSE)
m1 <- prcomp(dt_pca,
             center = TRUE,
             scale. = FALSE)
summary(m1)
Importance of components:
                          PC1     PC2     PC3     PC4     PC5      PC6      PC7       PC8
Standard deviation     0.1016 0.07982 0.05398 0.03765 0.01000 0.008812 0.002823 0.0004897
Proportion of Variance 0.4864 0.30051 0.13743 0.06688 0.00472 0.003660 0.000380 0.0000100
Cumulative Proportion  0.4864 0.78693 0.92435 0.99123 0.99595 0.999610 0.999990 1.0000000
                             PC9       PC10       PC11      PC12       PC13
Standard deviation     0.0001899 0.00008382 0.00003105 0.0000258 0.00002073
Proportion of Variance 0.0000000 0.00000000 0.00000000 0.0000000 0.00000000
Cumulative Proportion  1.0000000 1.00000000 1.00000000 1.0000000 1.00000000
# Select PC-s to pliot (PC1 & PC2)
choices <- 1:2
# Add meta data
dt.scr <- data.table(m1$x[, choices])
dt.scr$Sample <- rownames(m1$x)
dt.scr <- merge(smpl,
                dt.scr,
                by = "Sample")
dt.scr
# Loadings, i.e. arrows (df.v)
dt.rot <- as.data.frame(m1$rotation[, choices])
dt.rot$feat <- rownames(dt.rot)
dt.rot <- data.table(dt.rot)
dt.rot
dt.load <- melt.data.table(dt.rot,
                           id.vars = "feat",
                           measure.vars = 1:2,
                           variable.name = "pc",
                           value.name = "loading")
dt.load$feat <- factor(dt.load$feat,
                       levels = unique(dt.load$feat))
# Plot loadings
p0 <- ggplot(data = dt.load,
             aes(x = feat,
                 y = loading)) +
  facet_wrap(~ pc,
             nrow = 2) +
  geom_bar(stat = "identity") +
  ggtitle("PC Loadings") +
  theme(plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(angle = 45,
                                   hjust = 1))
p0
tiff(filename = "tmp/pc.1.2_loadings_phylum.tiff",
     height = 5,
     width = 6,
     units = 'in',
     res = 300,
     compression = "lzw+p")
print(p0)
graphics.off()
print(p0)

# Axis labels
u.axis.labs <- paste(colnames(dt.rot)[1:2], 
                     sprintf('(%0.1f%% explained var.)', 
                             100*m1$sdev[choices]^2/sum(m1$sdev^2)))
u.axis.labs
[1] "PC1 (48.6% explained var.)" "PC2 (30.1% explained var.)"
cntr <- data.table(aggregate(x = dt.scr$PC1,
                             by = list(dt.scr$Genotype),
                             FUN = "mean"),
                   aggregate(x = dt.scr$PC2,
                             by = list(dt.scr$Genotype),
                             FUN = "mean")$x)
colnames(cntr) <- c("Genotype",
                    "PC1",
                    "PC2")
# Based on Figure p0, keep only a few variables with high loadings in PC1 and PC2----
dt.rot[, rating:= (PC1)^2 + (PC2)^2]
setorder(dt.rot, -rating)
# Select top 5
dt.rot <- dt.rot[1:5, ]
# var.keep.ndx <- which(dt.rot$feat %in% c(...))
# Or select all
# var.keep.ndx <- 3:ncol(dt1)
# Use dt.rot[var.keep.ndx,] and dt.rot$feat[var.keep.ndx]
p1 <- ggplot(data = dt.rot,
             aes(x = PC1,
                 y = PC2)) +
  coord_equal() +
  geom_point(data = dt.scr,
             aes(fill = Genotype,
                 shape = factor(Timepoint)),
             size = 3,
             alpha = 0.5) +
  geom_segment(aes(x = 0,
                   y = 0,
                   xend = 0.2*PC1,
                   yend = 0.2*PC2),
               arrow = arrow(length = unit(1/2, 'picas')),
               # size = 1, 
               color = "black") +
  geom_text(aes(x = 0.22*PC1,
                y = 0.22*PC2,
                label = dt.rot$feat),
            # size = 5,
            hjust = 0.5) +
  scale_x_continuous(u.axis.labs[1]) +
  scale_y_continuous(u.axis.labs[2]) +
  scale_fill_manual(name = "Group",
                    breaks = c("Wild Type",
                               "Nrf2 KO"),
                    values = c("red",
                               "blue")) +
  scale_shape_manual(breaks = 1:3,
                     values = 21:23) +
  geom_label(data = cntr,
             aes(x = PC1,
                 y = PC2,
                 label = Genotype,
                 colour = Genotype),
             alpha = 0.5,
             size = 3) +
  scale_color_manual(guide = FALSE,
                     breaks = c("Wild Type",
                                "Nrf2 KO"),
                     values = c("red",
                                "blue")) +
  ggtitle("") +
  theme(plot.title = element_text(hjust = 0.5,
                                  size = 20),
        legend.position = "none") 
tiff(filename = "tmp/phylum_biplot_grp.tiff",
     height = 8,
     width = 8,
     units = 'in',
     res = 300,
     compression = "lzw+p")
print(p1)
graphics.off()
ggplotly(p1)
geom_GeomLabel() has yet to be implemented in plotly.
  If you'd like to see this geom implemented,
  Please open an issue with your example code at
  https://github.com/ropensci/plotly/issuesgeom_GeomLabel() has yet to be implemented in plotly.
  If you'd like to see this geom implemented,
  Please open an issue with your example code at
  https://github.com/ropensci/plotly/issues

# Generci biplot
biplot(m1)

15 2. Class

15.1 Counts at Class level

15.2 Relative abundance (%) at Class level

15.3 PCA at Class level

dt_pca <- t(ra_c[, 2:ncol(ra_c)])
colnames(dt_pca) <- ra_c$Class
dt_pca_c <- data.table(Sample = rownames(dt_pca),
                       dt_pca)
dt_pca_c <- merge(smpl,
                  dt_pca_c,
                  by = "Sample")
# Keep only the Class with non-zero counts
tmp <- dt_pca_c[, 10:ncol(dt_pca_c)]
keep_c <- colnames(tmp)[colSums(tmp) > 0]
dt_pca <- dt_pca[, keep_c]
m1 <- prcomp(dt_pca,
             center = TRUE,
             scale. = FALSE)
summary(m1)
Importance of components:
                           PC1     PC2    PC3     PC4     PC5     PC6     PC7      PC8
Standard deviation     0.08999 0.07999 0.0740 0.05046 0.03517 0.02037 0.01466 0.009891
Proportion of Variance 0.32837 0.25949 0.2220 0.10327 0.05015 0.01682 0.00872 0.003970
Cumulative Proportion  0.32837 0.58786 0.8099 0.91318 0.96333 0.98016 0.98887 0.992840
                            PC9     PC10     PC11    PC12     PC13      PC14      PC15
Standard deviation     0.008728 0.007865 0.005555 0.00265 0.000615 0.0004527 0.0001791
Proportion of Variance 0.003090 0.002510 0.001250 0.00028 0.000020 0.0000100 0.0000000
Cumulative Proportion  0.995930 0.998440 0.999690 0.99997 0.999990 1.0000000 1.0000000
                             PC16      PC17      PC18       PC19       PC20       PC21
Standard deviation     0.00008448 0.0000719 0.0000276 0.00002248 0.00002123 0.00001378
Proportion of Variance 0.00000000 0.0000000 0.0000000 0.00000000 0.00000000 0.00000000
Cumulative Proportion  1.00000000 1.0000000 1.0000000 1.00000000 1.00000000 1.00000000
                             PC22       PC23        PC24
Standard deviation     0.00001094 0.00001006 0.000009136
Proportion of Variance 0.00000000 0.00000000 0.000000000
Cumulative Proportion  1.00000000 1.00000000 1.000000000
# Select PC-s to pliot (PC1 & PC2)
choices <- 1:2
# Add meta data
dt.scr <- data.table(m1$x[, choices])
dt.scr$Sample <- rownames(m1$x)
dt.scr <- merge(smpl,
                dt.scr,
                by = "Sample")
dt.scr
# Loadings, i.e. arrows (df.v)
dt.rot <- as.data.frame(m1$rotation[, choices])
dt.rot$feat <- rownames(dt.rot)
dt.rot <- data.table(dt.rot)
dt.rot
dt.load <- melt.data.table(dt.rot,
                           id.vars = "feat",
                           measure.vars = 1:2,
                           variable.name = "pc",
                           value.name = "loading")
dt.load$feat <- factor(dt.load$feat,
                       levels = unique(dt.load$feat))
# Plot loadings
p0 <- ggplot(data = dt.load,
             aes(x = feat,
                 y = loading)) +
  facet_wrap(~ pc,
             nrow = 2) +
  geom_bar(stat = "identity") +
  ggtitle("PC Loadings") +
  theme(plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(angle = 45,
                                   hjust = 1))
p0
tiff(filename = "tmp/pc.1.2_loadings_Class.tiff",
     height = 5,
     width = 6,
     units = 'in',
     res = 300,
     compression = "lzw+p")
print(p0)
graphics.off()
print(p0)

# Axis labels
u.axis.labs <- paste(colnames(dt.rot)[1:2], 
                     sprintf('(%0.1f%% explained var.)', 
                             100*m1$sdev[choices]^2/sum(m1$sdev^2)))
u.axis.labs
[1] "PC1 (32.8% explained var.)" "PC2 (25.9% explained var.)"
cntr <- data.table(aggregate(x = dt.scr$PC1,
                             by = list(dt.scr$Genotype),
                             FUN = "mean"),
                   aggregate(x = dt.scr$PC2,
                             by = list(dt.scr$Genotype),
                             FUN = "mean")$x)
colnames(cntr) <- c("Genotype",
                    "PC1",
                    "PC2")
# Based on Figure p0, keep only a few variables with high loadings in PC1 and PC2----
dt.rot[, rating:= (PC1)^2 + (PC2)^2]
setorder(dt.rot, -rating)
# Select top 8
dt.rot <- dt.rot[1:8, ]
# var.keep.ndx <- which(dt.rot$feat %in% c(...))
# Or select all
# var.keep.ndx <- 3:ncol(dt1)
# Use dt.rot[var.keep.ndx,] and dt.rot$feat[var.keep.ndx]
p1 <- ggplot(data = dt.rot,
             aes(x = PC1,
                 y = PC2)) +
  coord_equal() +
  geom_point(data = dt.scr,
             aes(fill = Genotype,
                 shape = factor(Timepoint)),
             size = 3,
             alpha = 0.5) +
  geom_segment(aes(x = 0,
                   y = 0,
                   xend = 0.2*PC1,
                   yend = 0.2*PC2),
               arrow = arrow(length = unit(1/2, 'picas')),
               # size = 1, 
               color = "black") +
  geom_text(aes(x = 0.22*PC1,
                y = 0.22*PC2,
                label = dt.rot$feat),
            # size = 5,
            hjust = 0.5) +
  scale_x_continuous(u.axis.labs[1]) +
  scale_y_continuous(u.axis.labs[2]) +
  scale_fill_manual(name = "Group",
                    breaks = c("Wild Type",
                               "Nrf2 KO"),
                    values = c("red",
                               "blue")) +
  scale_shape_manual(breaks = 1:3,
                     values = 21:23) +
  geom_label(data = cntr,
             aes(x = PC1,
                 y = PC2,
                 label = Genotype,
                 colour = Genotype),
             alpha = 0.5,
             size = 3) +
  scale_color_manual(guide = FALSE,
                     breaks = c("Wild Type",
                                "Nrf2 KO"),
                     values = c("red",
                                "blue")) +
  ggtitle("") +
  theme(plot.title = element_text(hjust = 0.5,
                                  size = 20),
        legend.position = "none") 
tiff(filename = "tmp/class_biplot_gen.tiff",
     height = 8,
     width = 8,
     units = 'in',
     res = 300,
     compression = "lzw+p")
print(p1)
graphics.off()
ggplotly(p1)
geom_GeomLabel() has yet to be implemented in plotly.
  If you'd like to see this geom implemented,
  Please open an issue with your example code at
  https://github.com/ropensci/plotly/issuesgeom_GeomLabel() has yet to be implemented in plotly.
  If you'd like to see this geom implemented,
  Please open an issue with your example code at
  https://github.com/ropensci/plotly/issues

# Generic biplot
biplot(m1)

15.4 3. Order

15.5 4. Family

15.6 5 Geus

15.7 Counts at Genus level

15.8 Relative abundance (%) at Genus level

Error in sum(a) : invalid 'type' (character) of argument

16 Session Information

sessionInfo()
---
title: "Nrf2 BL6 Wild-Type (WT) PEITC 16S Microbiome Data Visualization"
output: 
  html_notebook:
    toc: yes
    toc_float: yes
    number_sections: yes
    code_folding: hide
---
Date: `r date()`     
Scientist: [Ran Yin](mailto:ry147@scarletmail.rutgers.edu)      
Sequencing (Waksman): [Dibyendu Kumar](mailto:dk@waksman.rutgers.edu)      
Statistics: [Davit Sargsyan](mailto:sargdavid@gmail.com)      
Principal Investigator: [Ah-Ng Kong](mailto:kongt@pharmacy.rutgers.edu) 

```{}
# Taxonomic Ranks:
# **K**ing **P**hillip **C**an n**O**t **F**ind **G**reen **S**ocks
# * Kingdom                
# * Phylum                    
# * Class                   
# * Order                   
# * Family     
# * Genus     
# * Species  
```

```{r setup}
options(stringsAsFactors = FALSE,
        scipen = 999)

# # Increase mmemory size to 64 Gb----
# invisible(utils::memory.limit(65536))
# str(knitr::opts_chunk$get())
# # NOTE: the below does not work!
# knitr::opts_chunk$set(echo = FALSE, 
#                       message = FALSE,
#                       warning = FALSE,
#                       error = FALSE)

# require(knitr)
# require(kableExtra)
# require(shiny)

require(phyloseq)
require(data.table)
require(ggplot2)
require(plotly)
require(DT)

source("source/functions_may2019.R")

# On Windows set multithread=FALSE----
mt <- TRUE
```

# Introduction
November 2018 Batch: Nrf2 KO (-/-) Mice  
May 2019 Batch: Wild Type Mice  

# Data preprocessing
## Raw Data 
FastQ files were downloaded from [Dr. Kumar's DropBox](https://www.dropbox.com/sh/sm9tinm0f5r6y1v/AADjGPRRNiIM7zMSfANDkQjFa?dl=0). A total of 60 files (2 per sample, pair-ended) and 2 metadata files were downloaded.

## Script
Data processing scripts (***nrf2ubiome_dada2_nov2018_v1.Rmd*** and ***nrf2ubiome_dada2_may2019_v1.Rmd***) were developed using [DADA2 Pipeline Tutorial (1.12)](https://benjjneb.github.io/dada2/tutorial.html) with tips and tricks from the [University of Maryland Shool of Medicine Institute for Genome Sciences (IGS)](http://www.igs.umaryland.edu/) [Microbiome Analysis Workshop (April 8-11, 2019)](http://www.igs.umaryland.edu/education/wkshp_metagenome.php). The oresults of the DADA2 scripts (***data_nov2018/ps_nov2018.RData*** and ***data_may2019/ps_may2019.RData***) are explored in this document.

# Meta data: sample description
```{r data}
# Load data----
# Counts
load("data_nov2018/ps_nov2018.RData")
ps_nov2018 <- copy(ps)
rm(ps)
# Remove "Undetermined" sample
ps_nov2018 <- subset_samples(ps_nov2018, 
                             Name != "Undetermined_S0")

load("data_may2019/ps_may2019.RData")

gc(verbose = FALSE)

# Taxonomy (use the same one for both batches!)
load("data_may2019/taxa.RData")
taxa <- data.table(seq16s = rownames(taxa),
                   taxa)

# Samples
meta18 <- ps_nov2018@sam_data
datatable(meta18,
          options = list(pageLength = nrow(meta18)),
          caption = "Nrf2 KO (Nov 18) Meta Data")

meta19 <- ps_may2019@sam_data
datatable(meta19,
          options = list(pageLength = nrow(meta19)),
          caption = "WT (May 19) Meta Data")
```

# OTUs mapped to Kingdoms, each batch separate
```{r mapping_kingdom_by_batch, warning = FALSE, echo = FALSE, message = FALSE}
t1 <- data.table(table(tax_table(ps_nov2018)[,
                                             "Kingdom"],
                       exclude = NULL))
t2 <- data.table(table(tax_table(ps_may2019)[,
                                             "Kingdom"],
                       exclude = NULL))
tt1 <- merge(t1,
             t2,
             by = "V1",
             all = TRUE)

tt1$V1[is.na(tt1$V1)] <- "Unknown"
tt1$N.y[is.na(tt1$N.y)] <- 0


tt1[,pct.x := N.x/sum(N.x, na.rm = TRUE)]
tt1[,pct.y := N.y/sum(N.y, na.rm = TRUE)]
tt1 <- tt1[order(tt1$N.x,
                 decreasing = TRUE), ]

colnames(tt1) <- c("Kingdom",
                   "Number of OTUs in Nrf2 KO Mice",
                   "Number of OTUs in WT Mice",
                   "Percent of OTUs in Nrf2 KO Mice",
                   "Percent  of OTUs in WT Mice")

datatable(tt1,
          rownames = FALSE,
          caption = "Number of OTUs by Kingdom",
          class = "cell-border stripe",
          options = list(search = FALSE,
                         pageLength = nrow(t1))) %>%
  formatCurrency(columns = 2:3,
                 currency = "",
                 mark = ",",
                 digits = 0) %>%
  formatPercentage(columns = 4:5,
                   digits = 2)
```

# Merge the 2 data sets
```{r merge_batches}
pss <- merge_phyloseq(ps_nov2018,
                      ps_may2019)
tmp <- pss@sam_data

# Diet
meta <- data.table(Sample = rownames(pss@sam_data),
                   Diet = tmp$Diet)
meta$Diet <- gsub(x = meta$Diet, 
                  pattern = " Control",
                  replacement = "")
meta$Diet[!is.na(tmp$TREATMENT)] <- tmp$TREATMENT[!is.na(tmp$TREATMENT)]
meta$Diet <- gsub(x = meta$Diet,
                  pattern = "Control",
                  replacement = "AIN93M")
meta$Diet[is.na(meta$Diet)] <- "Pooled"
meta$Diet <- factor(meta$Diet,
                    levels = c("AIN93M",
                               "PEITC",
                               "Pooled"))

# Gemotype
meta$Genotype <- "Nrf2 KO"
meta$Genotype[is.na(tmp$Sex)] <- "Wild Type"
meta$Genotype <- factor(meta$Genotype,
                        levels = c("Wild Type",
                                   "Nrf2 KO"))

# Time
meta$Week <- as.numeric(as.character(tmp$Week)) - 4
meta$Week[meta$Genotype == "Wild Type"] <- as.numeric(gsub(x = tmp$WEEK[meta$Genotype == "Wild Type"],
                                                           pattern = "week ",
                                                           replacement = ""))
meta$Week <- paste("Week",
                   meta$Week)
meta$Week <- factor(meta$Week,
                    levels = c("Week 0",
                               "Week 1",
                               "Week 4",
                               "Week 5"))

# Mouse ID
meta$Mouse_Num <- as.numeric(as.character(tmp$MouseNum))
meta$Mouse_Num[meta$Genotype == "Wild Type"] <- as.numeric(substr(x = tmp$SAMPLE_NAME[meta$Genotype == "Wild Type"],
                                                                  start = 3, 
                                                                  stop = 3))

# Cage number
meta$Cage <- as.character(tmp$Cage)
meta$Cage[meta$Genotype == "Wild Type"] <- substr(x = tmp$SAMPLE_NAME[meta$Genotype == "Wild Type"],
                                                  start = 2, 
                                                  stop = 2)
meta$Cage <- factor(meta$Cage)

meta <- data.frame(meta)
rownames(meta) <- meta$Sample
meta

# Edit meta data
sample_data(pss) <- meta
pss@sam_data
```

```{r mapping_kingdom_combined, warning = FALSE, echo = FALSE, message = FALSE}
t1 <- data.table(table(tax_table(pss)[,
                                      "Kingdom"],
                       exclude = NULL))
t1$V1[is.na(t1$V1)] <- "Unknown"

t1[, pct := N/sum(N)]
setorder(t1, -N)

colnames(t1) <- c("Kingdom",
                  "Number of OTUs",
                  "Percent of OTUs")
datatable(t1,
          rownames = FALSE,
          caption = "Number of OTUs by Kingdom",
          class = "cell-border stripe",
          options = list(search = FALSE,
                         pageLength = nrow(t1))) %>%
  formatCurrency(columns = 2,
                 currency = "",
                 mark = ",",
                 digits = 0) %>%
  formatPercentage(columns = 3,
                   digits = 2)
```

```{r keep_bacteria}
dim(pss@otu_table@.Data)

# Remove OTU unmapped to Bacteria
ps0 <- subset_taxa(pss, 
                   Kingdom == "Bacteria")
dim(ps0@otu_table@.Data)
```

# OTU table (first 10 rows)
```{r otu_table, warning=FALSE,echo=FALSE,message=FALSE}
otu <- data.table(ps0@tax_table@.Data,
                  t(ps0@otu_table@.Data))

# Remove Species mapping'
otu$Species <- NULL

datatable(head(otu, 10),
          rownames = FALSE,
          caption = "Taxonomic  count table",
          class = "cell-border stripe",
          options = list(search = FALSE,
                         pageLength = 10)) %>%
  formatCurrency(columns = 7:36,
                 currency = "",
                 mark = ",",
                 digits = 0)
```

# Total counts per sample (i.e. sequencing depth)
```{r seq_depth_plotly, warning=FALSE,echo=FALSE,message=FALSE,fig.width=10,fig.height=5}
t1 <- colSums(otu[, 7:ncol(otu)])
t1 <- data.table(Sample = names(t1),
                 Total = t1)

smpl <- merge(meta,
              t1,
              by = "Sample")

p1 <- ggplot(smpl,
             aes(x = Sample,
                 y = Total,
                 fill =Diet,
                 colour = Week)) +
  facet_wrap(~ Genotype,
             scale = "free") +
  geom_bar(stat = "identity") +
  scale_x_discrete("Sample Name") +
  scale_y_continuous("Number of Reads") +
  scale_fill_discrete("Group") +
  theme(axis.text.x = element_text(angle = 45,
                                   hjust = 1)) 
ggplotly(p1)
```

```{r seq_depth_greyscale, fig.height = 6 , fig.width =8}
p1 <- ggplot(smpl,
             aes(x = Sample,
                 y = Total,
                 group =Diet,
                 fill = Diet)) +
  facet_wrap(~ Genotype + Week,
             scale = "free") +
  geom_bar(stat = "identity",
           color = "black") +
  scale_x_discrete("") +
  scale_y_continuous("Number of Reads") +
  scale_fill_grey("Treatment", 
                  start = 0.1, 
                  end = 1,
                  na.value = "red",
                  aesthetics = "fill") +
  theme_bw() + 
  theme(panel.border = element_blank(), 
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(), 
        axis.title.x = element_blank(),
        # axis.text.x = element_blank(),
        axis.text.x = element_text(angle = 45,
                                   hjust = 1),
        # axis.ticks.x=element_blank(),
        legend.position = "top")

tiff(filename = "tmp/seq_depth_nov2018_may2019.tiff",
     height =6,
     width = 8,
     units = "in",
     res = 600,
     compression = "lzw+p")
print(p1)
graphics.off()

print(p1)
```


```{r phylum_mapping}
t2 <- data.table(table(tax_table(ps0)[, "Phylum"],
                                  exclude = NULL))
t2$V1[is.na(t2$V1)] <- "Unknown"
setorder(t2, -N)
t2[, pct := N/sum(N)]
setorder(t2, -N)

colnames(t2) <- c("Phylum",
                  "Number of OTUs",
                  "Percent of OTUs")

datatable(t2,
          rownames = FALSE,
          caption = "Number of Bacterial OTUs by Phylum",
          class = "cell-border stripe",
          options = list(search = FALSE,
                         pageLength = nrow(t2))) %>%
  formatCurrency(columns = 2,
                 currency = "",
                 mark = ",",
                 digits = 0) %>%
  formatPercentage(columns = 3,
                   digits = 2)

ps1 <- subset_taxa(ps0, 
                   !is.na(Phylum))
dim(ps1@otu_table@.Data)
```

# Remove Phylum
Remove:  
1. Unmapped OTUs ("Unknown").    
2. Cyanobacteria: aerobic, photosynthesizing  bacteria that probably got into the sample through food.  
NOTE: [Chloroflexi might be ok.](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4192840/)  
  
```{r remove_phylums}
ps0 <- subset_taxa(ps0,
                   (Phylum %in% c("Unknown",
                                   "Cyanobacteria")))
```

# Richness (Alpha diversity)
Shannon index (aka Shannon enthrophy) is calculated as:  
H' = -sum(1 to R)p(i)ln(p(i)) 
When there is exactly 1 type of data (e.g. a single species in the sample), H'=0. The opposite scenario is when there are R>1 species present in the sample in the exact same amounts and H'=ln(R).  
  
Shannon's diversity index was calculated for each sample and ploted over time.

```{r shannon_vs_depth, fig.height = 5, fig.width = 6}
shannon.ndx <- estimate_richness(ps0,
                                 measures = "Shannon")

shannon.ndx <- data.table(Sample = rownames(shannon.ndx),
                          shannon.ndx)

smpl <- merge(smpl,
              shannon.ndx,
              by = "Sample")

p1 <- ggplot(smpl,
             aes(x = Total,
                 y = Shannon,
                 fill = Genotype,
                 shape = Week)) +
  geom_point(size = 2) +
  scale_shape_manual(breaks = unique(smpl$Week),
                     values = 21:24)

tiff(filename = "tmp/shannon_vs_depth_nov18_may19.tiff",
     height = 5,
     width = 6,
     units = "in",
     res = 600,
     compression = "lzw+p")
print(p1)
graphics.off()

ggplotly(p1)
```

Even though ***estimate_richness*** function does not adjust for the sequencing depth, there is no correlation between the index and the sample's sequecing depth. Proceed with the comparison.

# Shannon idex over time
```{r richness, fig.height = 5, fig.width = 8}
p1 <- plot_richness(ps0,
                    x = "Week", 
                    measures = "Shannon") +
  facet_wrap(~ Genotype,
             scale = "free_x") +
  geom_line(aes(group = paste0(Diet,
                               Cage,
                               Mouse_Num)),
            color = "black") +
  geom_point(aes(fill = Diet),
             shape = 21,
             size = 3,
             color = "black") +
  scale_x_discrete("") +
  theme(axis.text.x = element_text(angle = 30,
                                   hjust = 1,
                                   vjust = 1))

ggplotly(p = p1,
         tooltip = c("Mouse_Num",
                     "value"))

p1 <- p1 + 
  scale_fill_discrete("") +
  theme(legend.position = "top")

tiff(filename = "tmp/shannon_nov18_may19.tiff",
     height = 5,
     width = 8,
     units = "in",
     res = 600,
     compression = "lzw+p")
print(p1)
graphics.off()
```

The plot above suggests that the largest differences in alpha diversity (as measured by Shannon's index) are in genotype. THis was also confirmed in the 3rd study (September 2019 batch)

# Average Shannon Index
```{r avg_shannon_plot, fig.height = 4, fig.width = 6}
# Average shannon index by treatment group
tmp <- data.table(copy(smpl))

tmp[, mu := mean(Shannon),
    by = list(Diet,
              Genotype,
              Week)]
tmp[, sem := sd(Shannon)/sqrt(.N),
    by = list(Diet,
              Genotype,
              Week)]
tmp <- unique(tmp[, c("Diet",
                      "Genotype",
                      "Week",
                      "mu",
                      "sem")])

p1 <- ggplot(tmp,
             aes(x = Week,
                 y = mu,
                 ymin = mu - sem,
                 ymax = mu + sem,
                 fill = Diet,
                 group = Diet)) +
  facet_wrap(~ Genotype,
             scale = "free_x") +
  geom_errorbar(position = position_dodge(0.3),
                width = 0.4) +
  geom_line(position = position_dodge(0.3)) +
  geom_point(size = 3,
             shape = 21,
             position = position_dodge(0.3)) +
  scale_x_discrete("") +
  scale_y_continuous("Shannon Index") +
  scale_fill_grey("Treatment", 
                  start = 0, 
                  end = 1,
                  na.value = "red",
                  aesthetics = "fill") +
  theme_bw() + 
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(), 
        # panel.border = element_blank(), 
        axis.title.x = element_blank(),
        # axis.text.x = element_blank(),
        axis.text.x = element_text(angle = 45,
                                   hjust = 1),
        axis.ticks.x=element_blank(),
        legend.position = "top")

tiff(filename = "tmp/avg_shannon_nov18_may19.tiff",
     height = 4,
     width = 6,
     units = "in",
     res = 600,
     compression = "lzw+p")
print(p1)
graphics.off()

print(p1)
```

NOTE: cannot test diet effect because of unassigend pooled samples in the KO mice. For the same reason, cannot use mixed effects model on the whole data set. At best, testing genotype and time trend by assuming Week 4 in WT = Week 5 in KO.
```{r test_richness}
smpl$Timepoint <- as.numeric(smpl$Week)
smpl$Timepoint[smpl$Timepoint == 4] <- 3

m1 <- lm(Shannon ~ Timepoint*Genotype,
         data = smpl)
summary(m1)
```

No significant time trend; significantly higher alpha diversity in the Nrf2 KO mice. This is possibly a batch effect but we observed same trend in Study 3 (Crabbery and PEITC with both genotypes in the same study).

# Bacteriotides vs. Firmicutes
```{r bact_vs_firm, fig.height = 6, fig.width = 8, warning = FALSE}
counts_p <- counts_by_tax_rank(dt1 = otu,
                               aggr_by = "Phylum")

fb <- t(counts_p[Phylum %in% c("Bacteroidetes",
                               "Firmicutes"), -1])
fb <- data.table(Sample = rownames(fb),
                 Bacteroidetes = fb[, 1],
                 Firmicutes = fb[, 2])
fb <- data.table(merge(meta,
            fb,
            by = "Sample"))

p1 <- ggplot(fb,
             aes(x = log2(Bacteroidetes),
                 y = log2(Firmicutes),
                 fill = Genotype)) +
  geom_point(size = 2,
             color = "black",
             shape = 21)

p2 <- ggplot(fb,
             aes(x = log2(Bacteroidetes),
                 y = log2(Firmicutes),
                 fill = Week)) +
  geom_point(size = 2,
             color = "black",
             shape = 21)
p3 <- ggplot(fb,
             aes(x = log2(Bacteroidetes),
                 y = log2(Firmicutes),
                 fill = Diet)) +
  geom_point(size = 2,
             color = "black",
             shape = 21)

p4 <- ggplot(fb,
             aes(x = Week,
                 y = Bacteroidetes/Firmicutes,
                 fill = Diet,
                 group = paste0(Diet,
                               Cage,
                               Mouse_Num))) +
  facet_grid(~ Genotype,
             scale = "free_x") +
  geom_hline(yintercept = 1,
             linetype = "dashed") +
  geom_line(position = position_dodge(0.3)) +
  geom_point(size = 2,
             color = "black",
             shape = 21,
             position = position_dodge(0.3))  +
  theme_bw() + 
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(), 
        # panel.border = element_blank(), 
        axis.title.x = element_blank(),
        # axis.text.x = element_blank(),
        axis.text.x = element_text(angle = 45,
                                   hjust = 1),
        axis.ticks.x=element_blank(),
        legend.position = "top")

tiff(filename = "tmp/bact_vs_firm_nov18_may19.tiff",
     height = 6,
     width =8,
     units = "in",
     res = 600,
     compression = "lzw+p")
gridExtra::grid.arrange(p1, p2, p3, p4)
graphics.off()

gridExtra::grid.arrange(p1, p2, p3, p4)
```

```{r avg_bact_firm, fig.height = 4, fig.width = 6}
fb[, mu := mean(Bacteroidetes/Firmicutes),
   by = c("Diet",
          "Genotype",
          "Week")]
fb[, sem := sd(Bacteroidetes/Firmicutes)/sqrt(.N),
   by = c("Diet",
          "Genotype",
          "Week")]

mufb <- unique(fb[, c("Diet",
                      "Genotype",
                      "Week",
                      "mu",
                      "sem")])

p1 <- ggplot(mufb,
             aes(x = Week,
                 y = mu,
                 ymin = mu - sem,
                 ymax = mu + sem,
                 fill = Diet,
                 group = Diet)) +
  facet_wrap(~ Genotype,
             scale = "free_x") +
  geom_hline(yintercept = 1,
             linetype = "dashed") +
  geom_errorbar(position = position_dodge(0.3),
                width = 0.4) +
  geom_line(position = position_dodge(0.3)) +
  geom_point(size = 3,
             shape = 21,
             position = position_dodge(0.3)) +
  scale_x_discrete("") +
  scale_y_continuous("Bacteroidetes/Firmicutes") +
  scale_fill_grey("Treatment", 
                  start = 0, 
                  end = 1,
                  na.value = "red",
                  aesthetics = "fill") +
  theme_bw() + 
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(), 
        # panel.border = element_blank(), 
        axis.title.x = element_blank(),
        # axis.text.x = element_blank(),
        axis.text.x = element_text(angle = 45,
                                   hjust = 1),
        axis.ticks.x=element_blank(),
        legend.position = "top")

tiff(filename = "tmp/avg_bact_firm_nov18_may19.tiff",
     height = 4,
     width = 6,
     units = "in",
     res = 600,
     compression = "lzw+p")
print(p1)
graphics.off()

print(p1)

mufb[, est := paste0(round(mu, 2),
                     "(",
                     round(sem, 2),
                     ")")]

t1 <- dcast.data.table(mufb,
                       Genotype + Diet ~ Week,
                       value.var = "est")
datatable(t1,
          rownames = FALSE,
          caption = "Average Ratio of Bacteroidetes and Firmicutes",
          class = "cell-border stripe",
          options = list(search = FALSE,
                         pageLength = nrow(t1)))
```

# Update OTU table: excuded unknown phylum and Cyanobacteria
```{r update_otu}
otu <- data.table(ps0@tax_table@.Data,
                  t(ps0@otu_table@.Data))

# Remove Species mapping'
otu$Species <- NULL
dim(otu)
```

# 1. Phylum
## Counts at Phylum level
```{r counts_p, warning=FALSE,echo=FALSE,message=FALSE}
counts_p <- counts_by_tax_rank(dt1 = otu,
                               aggr_by = "Phylum")
setorder(counts_p, -`4A`)
datatable(counts_p,
          rownames = FALSE,
          caption = "Taxonomic  count table",
          class = "cell-border stripe",
          options = list(search = FALSE,
                         pageLength = nrow(counts_p))) %>%
  formatCurrency(columns = 2:ncol(counts_p),
                 currency = "",
                 mark = ",",
                 digits = 0)
```

## Relative abundance (%) at Phylum level
```{r ra_p, warning=FALSE,echo=FALSE,message=FALSE}
ra_p <- ra_by_tax_rank(counts = counts_p,
                       pct = FALSE,
                       digit = 4)

datatable(ra_p,
          rownames = FALSE,
          caption = "Taxonomic  count table",
          class = "cell-border stripe",
          options = list(search = FALSE,
                         pageLength = nrow(ra_p))) %>%
  formatPercentage(columns = 2:ncol(counts_p),
                   digits = 2)
```

## PCA at Phylum level
```{r pca_p_p0, fig.height = 5, fig.width = 6}
dt_pca <- t(ra_p[, 2:ncol(ra_p)])
colnames(dt_pca) <- ra_p$Phylum

dt_pca_p <- data.table(Sample = rownames(dt_pca),
                       dt_pca)
dt_pca_p <- merge(smpl,
                  dt_pca_p,
                  by = "Sample")

# Keep only the phylum with non-zero counts
tmp <- dt_pca_p[, 10:ncol(dt_pca_p)]
keep_p <- colnames(tmp)[colSums(tmp) > 0]
dt_pca <- dt_pca[, keep_p]

# m1 <- prcomp(dt_pca,
#              center = TRUE,
#              scale. = TRUE)

# m1 <- prcomp(dt_pca,
#              center = FALSE,
#              scale. = FALSE)

m1 <- prcomp(dt_pca,
             center = TRUE,
             scale. = FALSE)

summary(m1)

# Select PC-s to pliot (PC1 & PC2)
choices <- 1:2

# Add meta data
dt.scr <- data.table(m1$x[, choices])
dt.scr$Sample <- rownames(m1$x)

dt.scr <- merge(smpl,
                dt.scr,
                by = "Sample")
dt.scr

# Loadings, i.e. arrows (df.v)
dt.rot <- as.data.frame(m1$rotation[, choices])
dt.rot$feat <- rownames(dt.rot)
dt.rot <- data.table(dt.rot)
dt.rot

dt.load <- melt.data.table(dt.rot,
                           id.vars = "feat",
                           measure.vars = 1:2,
                           variable.name = "pc",
                           value.name = "loading")
dt.load$feat <- factor(dt.load$feat,
                       levels = unique(dt.load$feat))
# Plot loadings
p0 <- ggplot(data = dt.load,
             aes(x = feat,
                 y = loading)) +
  facet_wrap(~ pc,
             nrow = 2) +
  geom_bar(stat = "identity") +
  ggtitle("PC Loadings") +
  theme(plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(angle = 45,
                                   hjust = 1))
p0

tiff(filename = "tmp/pc.1.2_loadings_phylum.tiff",
     height = 5,
     width = 6,
     units = 'in',
     res = 300,
     compression = "lzw+p")
print(p0)
graphics.off()

print(p0)
```

```{r pca_axesp}
# Axis labels
u.axis.labs <- paste(colnames(dt.rot)[1:2], 
                     sprintf('(%0.1f%% explained var.)', 
                             100*m1$sdev[choices]^2/sum(m1$sdev^2)))
u.axis.labs
```

```{r biplot_gen_p, fig.height = 8, fig.width = 8}
cntr <- data.table(aggregate(x = dt.scr$PC1,
                             by = list(dt.scr$Genotype),
                             FUN = "mean"),
                   aggregate(x = dt.scr$PC2,
                             by = list(dt.scr$Genotype),
                             FUN = "mean")$x)
colnames(cntr) <- c("Genotype",
                    "PC1",
                    "PC2")

# Based on Figure p0, keep only a few variables with high loadings in PC1 and PC2----
dt.rot[, rating:= (PC1)^2 + (PC2)^2]
setorder(dt.rot, -rating)

# Select top 5
dt.rot <- dt.rot[1:5, ]

# var.keep.ndx <- which(dt.rot$feat %in% c(...))
# Or select all
# var.keep.ndx <- 3:ncol(dt1)
# Use dt.rot[var.keep.ndx,] and dt.rot$feat[var.keep.ndx]
p1 <- ggplot(data = dt.rot,
             aes(x = PC1,
                 y = PC2)) +
  coord_equal() +
  geom_point(data = dt.scr,
             aes(fill = Genotype,
                 shape = factor(Timepoint)),
             size = 3,
             alpha = 0.5) +
  geom_segment(aes(x = 0,
                   y = 0,
                   xend = 0.2*PC1,
                   yend = 0.2*PC2),
               arrow = arrow(length = unit(1/2, 'picas')),
               # size = 1, 
               color = "black") +
  geom_text(aes(x = 0.22*PC1,
                y = 0.22*PC2,
                label = dt.rot$feat),
            # size = 5,
            hjust = 0.5) +
  scale_x_continuous(u.axis.labs[1]) +
  scale_y_continuous(u.axis.labs[2]) +
  scale_fill_manual(name = "Group",
                    breaks = c("Wild Type",
                               "Nrf2 KO"),
                    values = c("red",
                               "blue")) +
  scale_shape_manual(breaks = 1:3,
                     values = 21:23) +
  geom_label(data = cntr,
             aes(x = PC1,
                 y = PC2,
                 label = Genotype,
                 colour = Genotype),
             alpha = 0.5,
             size = 3) +
  scale_color_manual(guide = FALSE,
                     breaks = c("Wild Type",
                                "Nrf2 KO"),
                     values = c("red",
                                "blue")) +
  ggtitle("") +
  theme(plot.title = element_text(hjust = 0.5,
                                  size = 20),
        legend.position = "none") 

tiff(filename = "tmp/phylum_biplot_grp.tiff",
     height = 8,
     width = 8,
     units = 'in',
     res = 300,
     compression = "lzw+p")
print(p1)
graphics.off()

ggplotly(p1)

# Generic biplot
biplot(m1)
```

# 2. Class
## Counts at Class level
```{r counts_c, warning=FALSE,echo=FALSE,message=FALSE}
counts_c <- counts_by_tax_rank(dt1 = otu,
                               aggr_by = "Class")
setorder(counts_c, -`4A`)
datatable(counts_c,
          rownames = FALSE,
          caption = "Taxonomic  count table",
          class = "cell-border stripe",
          options = list(search = FALSE,
                         pageLength = nrow(counts_c))) %>%
  formatCurrency(columns = 2:ncol(counts_c),
                 currency = "",
                 mark = ",",
                 digits = 0)
```

## Relative abundance (%) at Class level
```{r ra_c, warning = FALSE, echo = FALSE, message = FALSE}
ra_c <- ra_by_tax_rank(counts = counts_c,
                       pct = FALSE,
                       digit = 4)

datatable(ra_c,
          rownames = FALSE,
          caption = "Taxonomic  count table",
          class = "cell-border stripe",
          options = list(search = FALSE,
                         pageLength = nrow(ra_c))) %>%
  formatPercentage(columns = 2:ncol(counts_c),
                   digits = 2)
```

## PCA at Class level
```{r pca_c_p0, fig.height = 5, fig.width = 6}
dt_pca <- t(ra_c[, 2:ncol(ra_c)])
colnames(dt_pca) <- ra_c$Class

dt_pca_c <- data.table(Sample = rownames(dt_pca),
                       dt_pca)
dt_pca_c <- merge(smpl,
                  dt_pca_c,
                  by = "Sample")

# Keep only the Class with non-zero counts
tmp <- dt_pca_c[, 10:ncol(dt_pca_c)]
keep_c <- colnames(tmp)[colSums(tmp) > 0]
dt_pca <- dt_pca[, keep_c]

m1 <- prcomp(dt_pca,
             center = TRUE,
             scale. = FALSE)

summary(m1)

# Select PC-s to pliot (PC1 & PC2)
choices <- 1:2

# Add meta data
dt.scr <- data.table(m1$x[, choices])
dt.scr$Sample <- rownames(m1$x)

dt.scr <- merge(smpl,
                dt.scr,
                by = "Sample")
dt.scr

# Loadings, i.e. arrows (df.v)
dt.rot <- as.data.frame(m1$rotation[, choices])
dt.rot$feat <- rownames(dt.rot)
dt.rot <- data.table(dt.rot)
dt.rot

dt.load <- melt.data.table(dt.rot,
                           id.vars = "feat",
                           measure.vars = 1:2,
                           variable.name = "pc",
                           value.name = "loading")
dt.load$feat <- factor(dt.load$feat,
                       levels = unique(dt.load$feat))
# Plot loadings
p0 <- ggplot(data = dt.load,
             aes(x = feat,
                 y = loading)) +
  facet_wrap(~ pc,
             nrow = 2) +
  geom_bar(stat = "identity") +
  ggtitle("PC Loadings") +
  theme(plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(angle = 45,
                                   hjust = 1))
p0

tiff(filename = "tmp/pc.1.2_loadings_Class.tiff",
     height = 5,
     width = 6,
     units = 'in',
     res = 300,
     compression = "lzw+p")
print(p0)
graphics.off()

print(p0)
```

```{r pca_axes_c}
# Axis labels
u.axis.labs <- paste(colnames(dt.rot)[1:2], 
                     sprintf('(%0.1f%% explained var.)', 
                             100*m1$sdev[choices]^2/sum(m1$sdev^2)))
u.axis.labs
```

```{r biplot_gen_c, fig.height = 8, fig.width = 8}
cntr <- data.table(aggregate(x = dt.scr$PC1,
                             by = list(dt.scr$Genotype),
                             FUN = "mean"),
                   aggregate(x = dt.scr$PC2,
                             by = list(dt.scr$Genotype),
                             FUN = "mean")$x)
colnames(cntr) <- c("Genotype",
                    "PC1",
                    "PC2")

# Based on Figure p0, keep only a few variables with high loadings in PC1 and PC2----
dt.rot[, rating:= (PC1)^2 + (PC2)^2]
setorder(dt.rot, -rating)

# Select top 8
dt.rot <- dt.rot[1:8, ]

# var.keep.ndx <- which(dt.rot$feat %in% c(...))
# Or select all
# var.keep.ndx <- 3:ncol(dt1)
# Use dt.rot[var.keep.ndx,] and dt.rot$feat[var.keep.ndx]
p1 <- ggplot(data = dt.rot,
             aes(x = PC1,
                 y = PC2)) +
  coord_equal() +
  geom_point(data = dt.scr,
             aes(fill = Genotype,
                 shape = factor(Timepoint)),
             size = 3,
             alpha = 0.5) +
  geom_segment(aes(x = 0,
                   y = 0,
                   xend = 0.2*PC1,
                   yend = 0.2*PC2),
               arrow = arrow(length = unit(1/2, 'picas')),
               # size = 1, 
               color = "black") +
  geom_text(aes(x = 0.22*PC1,
                y = 0.22*PC2,
                label = dt.rot$feat),
            # size = 5,
            hjust = 0.5) +
  scale_x_continuous(u.axis.labs[1]) +
  scale_y_continuous(u.axis.labs[2]) +
  scale_fill_manual(name = "Group",
                    breaks = c("Wild Type",
                               "Nrf2 KO"),
                    values = c("red",
                               "blue")) +
  scale_shape_manual(breaks = 1:3,
                     values = 21:23) +
  geom_label(data = cntr,
             aes(x = PC1,
                 y = PC2,
                 label = Genotype,
                 colour = Genotype),
             alpha = 0.5,
             size = 3) +
  scale_color_manual(guide = FALSE,
                     breaks = c("Wild Type",
                                "Nrf2 KO"),
                     values = c("red",
                                "blue")) +
  ggtitle("") +
  theme(plot.title = element_text(hjust = 0.5,
                                  size = 20),
        legend.position = "none") 

tiff(filename = "tmp/class_biplot_gen.tiff",
     height = 8,
     width = 8,
     units = 'in',
     res = 300,
     compression = "lzw+p")
print(p1)
graphics.off()

ggplotly(p1)

# Generic biplot
biplot(m1)
```

## 3. Order

## 4. Family

## 5 Geus
## Counts at Genus level
```{r counts_g, warning = FALSE, echo = FALSE, message = FALSE}
counts_g <- counts_by_tax_rank(dt1 = otu,
                               aggr_by = "Genus")
tmp <- unique(otu[, Kingdom:Genus])
counts_g_all <- merge(tmp,
                      counts_g,
                      by = "Genus")

setorder(counts_g_all, -`4A`)
datatable(counts_g,
          rownames = FALSE,
          caption = "Taxonomic  count table",
          class = "cell-border stripe",
          options = list(search = FALSE,
                         pageLength = nrow(counts_g))) %>%
  formatCurrency(columns = 7:ncol(counts_g),
                 currency = "",
                 mark = ",",
                 digits = 0)
```

## Relative abundance (%) at Genus level
```{r ra_g, warning = FALSE, echo = FALSE, message = FALSE}
ra_g <- ra_by_tax_rank(counts = counts_g,
                       pct = FALSE,
                       digit = 4)

datatable(ra_g,
          rownames = FALSE,
          caption = "Taxonomic  count table",
          class = "cell-border stripe",
          options = list(search = FALSE,
                         pageLength = nrow(ra_g))) %>%
  formatPercentage(columns = 2:ncol(counts_g),
                   digits = 2)
```

# Session Information
```{r info,eval=TRUE}
sessionInfo()
```